The transcription factors (TF) and DNA recognitions depend on multiples levels of interactions. The first level involves chromatin accessibility, where nuclesosome-depleted regions are highly associated with TFs binding compared to the closed chromatin, which is often inaccesible to most TFs. The second represents the existence of the consensus binding motif in the DNA sequence, a point that we are going to identify in this notebook in each cell type using Signac and ChromVar.
library(Seurat)
library(Signac)
library(reshape)
library(ggplot2)
library(JASPAR2020)
library(TFBSTools)
library(BSgenome.Hsapiens.UCSC.hg38)
library(patchwork)
library(chromVAR)
library(motifmatchr)
library(ggpubr)
library(data.table)
library(chromVARmotifs)
library(dplyr)
library(purrr)
library(readxl)
library(writexl)
library(pheatmap)
library(factoextra)
library(corrplot)
set.seed(1234)
cell_type = "PC"
# Paths
path_to_obj <- paste0(
here::here("scATAC-seq/results/R_objects/level_5/"),
cell_type,
"/04.",
cell_type,
"_integration_peak_calling_level_5.rds",
sep = ""
)
path_to_save <- paste0(
here::here("scATAC-seq/results/R_objects/level_5/"),
cell_type,
"/05.",
cell_type,
"_chromVar_CISBP_level_5.rds",
sep = ""
)
path_to_save_TF_motifs <- paste0(
here::here("scATAC-seq/results/files/"),
cell_type,
"/",
cell_type,
"_chromVar_CISBP_level_5.xlsx",
sep = ""
)
color_palette <- c("#73787E", "#B8BCC1","#FECFC7" ,"#FF8E7B",
"#a13b53","#A6E1F4", "#586BA4", "#323734",
"#035F72", "#9CC6CF" ,"#198C19","#006600","#FFD8B1")
remove_correlated_helper <- function(mat, val, cutoff = 0.9) {
stopifnot(nrow(mat) == length(val))
cormat <- cor(t(mat), use = "pairwise.complete.obs")
diag(cormat) <- NA
keep <- seq_len(nrow(mat))
for (i in order(val, decreasing = TRUE)) {
if (i %in% keep) {
toremove <- which(cormat[keep, i] >= cutoff)
if (length(toremove) > 0)
keep <- keep[-toremove]
}
}
return(keep)
}
DARS <- function(ident.1,ident.2){
DARs <- FindMarkers(
ident.1 = ident.1,
ident.2 = ident.2,
object = seurat,
min.pct = 0.05,
test.use = 'LR',
latent.vars = 'nCount_peaks_level_5')
DARs_filtered <- DARs[DARs$p_val_adj < 0.05,]
return(DARs_filtered)}
DARS_matrix_ChromVar <- function(DARs, group){
avgexpr_mat <- AverageExpression(
features = row.names(DARs),
seurat,
assays = "chromvar",
return.seurat = F,
group.by = group,
slot = "data")
return(avgexpr_mat)}
seurat <- readRDS(path_to_obj)
seurat
## An object of class Seurat
## 106083 features across 4160 samples within 1 assay
## Active assay: peaks_level_5 (106083 features, 104344 variable features)
## 3 dimensional reductions calculated: umap, lsi, harmony
DimPlot(seurat,
cols = color_palette,
pt.size = 0.8)
Curated collection of human motifs from cisBP database
data("human_pwms_v1")
length(human_pwms_v1)
## [1] 1764
seurat <- AddMotifs(
object = seurat,
genome = BSgenome.Hsapiens.UCSC.hg38,
pfm = human_pwms_v1
)
seurat[["peaks_level_5"]]
## ChromatinAssay data with 106083 features for 4160 cells
## Variable features: 104344
## Genome:
## Annotation present: TRUE
## Motifs present: TRUE
## Fragment files: 9
The TF motif enrichments (that help us to predict potential specific cell-type regulators) previously computed are not calculated per-cell and they do not take into account the insertion sequence bias of the Tn5 transposase. To account for these issues we can use chromVAR that computes for each motif annotation and each cell a bias corrected “desviation” in accessibility from a expected accessibility based on the average of all the cells. This allows us to visualize motif activities per cell, and also provides an alternative method of identifying differentially-active motifs between cell types.
# The RunChromVAR function retrieved the deviationScores, the Z-scores for each bias corrected deviations.
seurat <- RunChromVAR(
object = seurat,
genome = BSgenome.Hsapiens.UCSC.hg38
)
saveRDS(seurat, path_to_save)
avgexpr_mat <- AverageExpression(
seurat,
assays = "chromvar",
return.seurat = F,
group.by = "ident",
slot = "data")
res.pca <- prcomp(t(avgexpr_mat$chromvar),scale. = T)
options(repr.plot.width=6, repr.plot.height=8)
fviz_pca_ind(res.pca,
repel = TRUE)
pheatmap(avgexpr_mat$chromvar, scale = "row",
border_color = "black",
cluster_rows = T,
cluster_cols = T,
fontsize_row= 3,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
cutree_rows = NA,
cutree_cols = 2)
We compute the standar desviation for each annotated motif. Then, we select motifs that have a standar desviation higher than a specific theshold and use them to perform a correlation test and a principal component analysis. Note, that we are going to use the function “remove_correlated_helper”, to eliminate the variables that present a correlation greater than 0.9.
seurat_average <- AverageExpression(
seurat,
assays = "chromvar",
return.seurat = T,
group.by = "ident")
threshold = 1
matrix <- seurat_average[["chromvar"]]@data
vars <- matrixStats::rowSds(matrix, na_rm = TRUE)
boxplot(vars)
abline(h=1)
ix <- which(vars >= threshold)
ix2 <- ix[remove_correlated_helper(matrix[ix, , drop = FALSE],
vars[ix])]
cormat <- cor(matrix[ix2,],
use = "pairwise.complete.obs")
corrplot(cormat, type="upper", order="hclust", tl.col="black", tl.srt=45)
pc_res <- prcomp(t(matrix[ix2, ]))
fviz_pca_ind(pc_res, repel = TRUE)
human_pwms_v1[grep(pattern = "SIX4",x = name(human_pwms_v1))]
## PWMatrixList of length 1
## names(1): ENSG00000100625_LINE2190_SIX4_I
FeaturePlot(
object = seurat,
features = c("ENSG00000100219-LINE344-XBP1-D-N4",
"ENSG00000100219-LINE343-XBP1-D-N1",
"ENSG00000057657-LINE611-PRDM1-D-N4",
"ENSG00000137265-LINE2747-IRF4-D-N1",
"ENSG00000137265-LINE2748-IRF4-D-N1",
"ENSG00000213999-LINE2814-MEF2B-D",
"ENSG00000177045-LINE2505-SIX5-D-N1",
"ENSG00000177045-LINE2506-SIX5-D-N1",
"ENSG00000100625-LINE2190-SIX4-I"),
min.cutoff = 'q5',
max.cutoff = 'q95',
pt.size = 0.1,
ncol = 3
)
DefaultAssay(seurat) <- 'chromvar'
step1_ChromVar <- DARS(ident.1 = "Light Zone GCBC",
ident.2 = "PC committed Light Zone GCBC")
step2_ChromVar <- DARS(ident.1 = "PC committed Light Zone GCBC",
ident.2 = "IgG+ PC precursor")
step3_ChromVar <- DARS(ident.1 = "IgG+ PC precursor",
ident.2 = "Mature PC")
step4_ChromVar <- DARS(ident.1 = "class switch MBC",
ident.2 = "Mature PC")
up_selection1 <- row.names(step1_ChromVar %>% top_n(5, avg_log2FC))
down_selection1 <- row.names(step1_ChromVar %>% top_n(5, -avg_log2FC))
up_selection2 <- row.names(step2_ChromVar %>% top_n(3, avg_log2FC))
down_selection2 <- row.names(step2_ChromVar %>% top_n(3, -avg_log2FC))
up_selection3 <- row.names(step3_ChromVar %>% top_n(3, avg_log2FC))
down_selection3 <- row.names(step3_ChromVar %>% top_n(3, -avg_log2FC))
up_selection4 <- row.names(step4_ChromVar %>% top_n(3, avg_log2FC))
down_selection4 <- row.names(step4_ChromVar %>% top_n(3, -avg_log2FC))
FeaturePlot(
object = seurat,
features = c(up_selection1,down_selection1),
min.cutoff = 'q5',
max.cutoff = 'q95',
pt.size = 0.2,
ncol = 3
)
FeaturePlot(
object = seurat,
features = c(up_selection2,down_selection2),
min.cutoff = 'q5',
max.cutoff = 'q95',
pt.size = 0.2,
ncol = 3
)
FeaturePlot(
object = seurat,
features = c(up_selection3,down_selection3),
min.cutoff = 'q5',
max.cutoff = 'q95',
pt.size = 0.2,
ncol = 3
)
FeaturePlot(
object = seurat,
features = c(up_selection4,down_selection4),
min.cutoff = 'q5',
max.cutoff = 'q95',
pt.size = 0.2,
ncol = 3
)
FeaturePlot(
object = seurat,
order = T,
features = c("ENSG00000109320-LINE3205-NFKB1-D-N1",
"ENSG00000137709-LINE2649-POU2F3-D-N1",
"ENSG00000137265-LINE2748-IRF4-D-N1",
"ENSG00000115415-LINE3478-STAT1-D-N1",
"ENSG00000135363-LINE3654-LMO2-D-N1",
"ENSG00000196628-LINE303-TCF4-D-N3",
"ENSG00000177606-LINE517-JUN-D-N5",
"ENSG00000175592-LINE505-FOSL1-D-N3",
"ENSG00000162772-LINE458-ATF3-D-N1",
"ENSG00000142539-LINE1880-SPIB-D-N2"),
min.cutoff = 'q5',
max.cutoff = 'q95',
pt.size = 0.2,
ncol = 2
)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Motif_TF/lib/libopenblasp-r0.3.10.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] corrplot_0.84 factoextra_1.0.7.999 pheatmap_1.0.12 writexl_1.4.0 readxl_1.3.1 purrr_0.3.4 dplyr_1.0.7 chromVARmotifs_0.2.0 data.table_1.14.0 ggpubr_0.4.0 motifmatchr_1.10.0 chromVAR_1.10.0 patchwork_1.1.1 BSgenome.Hsapiens.UCSC.hg38_1.4.3 BSgenome_1.56.0 rtracklayer_1.48.0 Biostrings_2.56.0 XVector_0.28.0 GenomicRanges_1.40.0 GenomeInfoDb_1.24.2 IRanges_2.22.1 S4Vectors_0.26.0 BiocGenerics_0.34.0 TFBSTools_1.26.0 JASPAR2020_0.99.10 ggplot2_3.3.5 reshape_0.8.8 Signac_1.2.1 SeuratObject_4.0.2 Seurat_4.0.3 BiocStyle_2.16.1
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.1 reticulate_1.20 R.utils_2.10.1 tidyselect_1.1.1 poweRlaw_0.70.6 RSQLite_2.2.1 AnnotationDbi_1.50.3 htmlwidgets_1.5.3 grid_4.0.3 docopt_0.7.1 BiocParallel_1.22.0 Rtsne_0.15 munsell_0.5.0 codetools_0.2-17 ica_1.0-2 DT_0.16 future_1.21.0 miniUI_0.1.1.1 withr_2.4.2 colorspace_2.0-2 Biobase_2.48.0 knitr_1.30 ROCR_1.0-11 ggsignif_0.6.0 tensor_1.5 listenv_0.8.0 labeling_0.4.2 slam_0.1-47 GenomeInfoDbData_1.2.3 polyclip_1.10-0 bit64_4.0.5 farver_2.1.0 rprojroot_2.0.2 parallelly_1.26.1 vctrs_0.3.8 generics_0.1.0 xfun_0.18 lsa_0.73.2 ggseqlogo_0.1 R6_2.5.0 bitops_1.0-7 spatstat.utils_2.2-0
## [43] DelayedArray_0.14.0 assertthat_0.2.1 promises_1.2.0.1 scales_1.1.1 gtable_0.3.0 globals_0.14.0 goftest_1.2-2 seqLogo_1.54.3 rlang_0.4.11 RcppRoll_0.3.0 splines_4.0.3 rstatix_0.6.0 lazyeval_0.2.2 broom_0.7.2 spatstat.geom_2.2-0 BiocManager_1.30.10 yaml_2.2.1 reshape2_1.4.4 abind_1.4-5 backports_1.1.10 httpuv_1.6.1 tools_4.0.3 bookdown_0.21 nabor_0.5.0 ellipsis_0.3.2 spatstat.core_2.2-0 RColorBrewer_1.1-2 ggridges_0.5.3 Rcpp_1.0.6 plyr_1.8.6 zlibbioc_1.34.0 RCurl_1.98-1.2 rpart_4.1-15 deldir_0.2-10 pbapply_1.4-3 cowplot_1.1.1 zoo_1.8-9 haven_2.3.1 SummarizedExperiment_1.18.1 ggrepel_0.9.1 cluster_2.1.0 here_1.0.1
## [85] magrittr_2.0.1 scattermore_0.7 openxlsx_4.2.4 lmtest_0.9-38 RANN_2.6.1 SnowballC_0.7.0 fitdistrplus_1.1-5 matrixStats_0.59.0 hms_0.5.3 mime_0.11 evaluate_0.14 xtable_1.8-4 XML_3.99-0.3 rio_0.5.16 sparsesvd_0.2 gridExtra_2.3 compiler_4.0.3 tibble_3.1.2 KernSmooth_2.23-17 crayon_1.4.1 R.oo_1.24.0 htmltools_0.5.1.1 mgcv_1.8-33 later_1.2.0 tidyr_1.1.3 DBI_1.1.0 tweenr_1.0.1 MASS_7.3-53 car_3.0-10 Matrix_1.3-4 readr_1.4.0 R.methodsS3_1.8.1 igraph_1.2.6 forcats_0.5.0 pkgconfig_2.0.3 GenomicAlignments_1.24.0 TFMPvalue_0.0.8 foreign_0.8-80 plotly_4.9.4.1 spatstat.sparse_2.0-0 annotate_1.66.0 DirichletMultinomial_1.30.0
## [127] stringr_1.4.0 digest_0.6.27 sctransform_0.3.2 RcppAnnoy_0.0.18 pracma_2.2.9 CNEr_1.24.0 spatstat.data_2.1-0 cellranger_1.1.0 rmarkdown_2.5 leiden_0.3.8 fastmatch_1.1-0 uwot_0.1.10 curl_4.3.2 shiny_1.6.0 Rsamtools_2.4.0 gtools_3.9.2 lifecycle_1.0.0 nlme_3.1-150 jsonlite_1.7.2 carData_3.0-4 viridisLite_0.4.0 fansi_0.5.0 pillar_1.6.1 lattice_0.20-41 KEGGREST_1.28.0 fastmap_1.1.0 httr_1.4.2 survival_3.2-7 GO.db_3.11.4 glue_1.4.2 zip_2.1.1 qlcMatrix_0.9.7 png_0.1-7 bit_4.0.4 ggforce_0.3.2 stringi_1.6.2 blob_1.2.1 caTools_1.18.2 memoise_1.1.0 irlba_2.3.3 future.apply_1.7.0